When removing the gene ENSG00000187951 there was an important change in the WGCNA modules, and when analysing what could have caused this, we found that this gene had a very big outlier value for a single sample, which we think altered the values of the other genes during the vst transformation enough to alter the modules by the WGNCA algorithm (See 20_02_28_comparison.html for more details about this).
Since the preprocessing pipeline did not catch this behaviour and it ended up causing problems in downstream analysis, we want to see how often we can find these type of behaviours in the data and if/how much they alter the results of WGCNA modules.
To do this, for each gene, we’ll calculate its variance leaving one sample out, repeating this for all samples and then calculating the variance of these estimates. A high variance would point to the existance of an outlier value.
I’ll use the filtered raw data obtained with the 20_02_06_data_preprocessing.html pipeline, where genes with low levels of expression as well are samples with weird behaviours are supposed to have been filtered out already
load('./../../AllRegions/Data/filtered_raw_data_old.RData')
datExpr_raw = datExpr
datGenes_raw = datGenes
load('./../../AllRegions/Data/preprocessed_data.RData')
DE_info = DE_info %>% data.frame %>% mutate('ID' = rownames(datExpr), 'DE' = padj<0.05) %>%
dplyr::select(ID, log2FoldChange, padj, DE)
datExpr = datExpr_raw
datGenes = datGenes_raw
rm(datExpr_raw, dds)
sd_leave1out_sd_by_gene = data.frame('ID'=rownames(datExpr),
'sd_l1o'=apply(datExpr, 1, function(x){sd(sapply(1:ncol(datExpr), function(s) sd(x[-s])))})) %>%
left_join(DE_info, by = 'ID')
ENSG00000187951_val = sd_leave1out_sd_by_gene[sd_leave1out_sd_by_gene$ID == 'ENSG00000187951',2]
cat(paste0('Gene ENSG00000187951 has a value of ', round(ENSG00000187951_val,2)))
## Gene ENSG00000187951 has a value of 16.62
cat(paste0('There are ', sum(sd_leave1out_sd_by_gene$sd_l1o>ENSG00000187951_val), ' genes with a value higher than this (~',
round(100*mean(sd_leave1out_sd_by_gene$sd_l1o>ENSG00000187951_val),1),'%)'))
## There are 497 genes with a value higher than this (~3.1%)
summary(sd_leave1out_sd_by_gene$sd_l1o)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.008 0.398 1.075 3.758 2.802 4226.243
p = sd_leave1out_sd_by_gene %>% ggplot(aes(ID, sd_l1o+1, color = DE)) + geom_point(alpha=0.3) +
geom_hline(yintercept = ENSG00000187951_val, color='gray') +
xlab('Genes') + ylab('') + scale_y_log10() + theme_minimal() +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(),
panel.grid.major = element_blank())
ggExtra::ggMarginal(p, type = 'density', margins='y', groupColour=TRUE, groupFill=TRUE, size=10)
Top genes
top_genes = sd_leave1out_sd_by_gene %>% arrange(desc(sd_l1o)) %>% top_n(n=21, w=sd_l1o) %>%
left_join(datGenes, by=c('ID' = 'ensembl_gene_id')) %>% dplyr::select(ID, hgnc_symbol, sd_l1o)
kable(top_genes, caption='Top genes with the highest sd of leave 1 out sd')
| ID | hgnc_symbol | sd_l1o |
|---|---|---|
| ENSG00000198804 | MT-CO1 | 4226.2426 |
| ENSG00000118785 | SPP1 | 937.9286 |
| ENSG00000198938 | MT-CO3 | 890.2146 |
| ENSG00000198712 | MT-CO2 | 714.6016 |
| ENSG00000131095 | GFAP | 705.7251 |
| ENSG00000198886 | MT-ND4 | 600.7525 |
| ENSG00000110436 | SLC1A2 | 486.6570 |
| ENSG00000197971 | MBP | 425.5593 |
| ENSG00000198786 | MT-ND5 | 363.9253 |
| ENSG00000198727 | MT-CYB | 362.5641 |
| ENSG00000087086 | FTL | 283.8756 |
| ENSG00000086570 | FAT2 | 253.8446 |
| ENSG00000198888 | MT-ND1 | 239.5533 |
| ENSG00000198899 | MT-ATP6 | 232.7214 |
| ENSG00000123560 | PLP1 | 229.9541 |
| ENSG00000131018 | SYNE1 | 209.3978 |
| ENSG00000198668 | CALM1 | 191.2898 |
| ENSG00000075624 | ACTB | 191.0754 |
| ENSG00000198840 | MT-ND3 | 174.4450 |
| ENSG00000163046 | ANKRD30BL | 154.7978 |
| ENSG00000134982 | APC | 154.4011 |
plot_function = function(i){
i = 3*i-2
plot_list = list()
for(j in 1:3){
plot_data = data.frame('sample' = colnames(datExpr), 'expr' = t(datExpr[top_genes$ID[i+j-1],]), 'Diagnosis' = datMeta$Diagnosis)
colnames(plot_data)[2] = 'expr'
plot_list[[j]] = ggplotly(plot_data %>% ggplot(aes(sample, expr, color=Diagnosis)) +
geom_point() + theme_minimal() +
theme(legend.position='none', axis.text.x=element_blank(), axis.ticks.x=element_blank()))
}
p = subplot(plot_list, nrows=1) %>% layout(annotations = list(
list(x = 0.1 , y = 1.05, text = top_genes$hgnc_symbol[i], showarrow = F, xref='paper', yref='paper'),
list(x = 0.5 , y = 1.05, text = top_genes$hgnc_symbol[i+1], showarrow = F, xref='paper', yref='paper'),
list(x = 0.9 , y = 1.05, text = top_genes$hgnc_symbol[i+2], showarrow = F, xref='paper', yref='paper')))
return(p)
}
plot_function(1)
plot_function(2)
plot_function(3)
plot_function(4)
plot_function(5)
plot_function(6)
plot_function(7)
max_z_score = data.frame('ID' = rownames(datExpr), 'z_score' = apply(datExpr,1,function(x) max((x-mean(x))/sd(x))),
'outlier_sample' = apply(datExpr, 1, function(x) datMeta$Sample_ID[which.max(abs(x-mean(x))/sd(x))])) %>%
left_join(DE_info, by='ID')
ENSG00000187951_val = max_z_score[max_z_score$ID == 'ENSG00000187951',2]
cat(paste0('Gene ENSG00000187951 has a value of ', round(ENSG00000187951_val,1)))
## Gene ENSG00000187951 has a value of 8.8
cat(paste0('There are ', sum(max_z_score$z_score>ENSG00000187951_val), ' genes with a value higher than this (~',
round(100*mean(max_z_score$z_score>ENSG00000187951_val),2),'%)'))
## There are 3 genes with a value higher than this (~0.02%)
summary(max_z_score$z_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.568 2.676 3.181 3.475 4.003 8.819
p = max_z_score %>% ggplot(aes(ID, z_score, color = DE)) + geom_point(alpha=0.3) +
geom_hline(yintercept = ENSG00000187951_val, color='gray') +
xlab('Genes') + ylab('Max Z-score') + theme_minimal() +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank(),
legend.position='none', panel.grid.major = element_blank())
ggExtra::ggMarginal(p, type = 'density', margins='y', groupColour=TRUE, groupFill=TRUE, size=10)
Top 20 genes
top_genes = max_z_score %>% dplyr::top_n(n=21, w=z_score) %>% arrange(desc(z_score)) %>%
left_join(datGenes, by=c('ID' = 'ensembl_gene_id')) %>%
dplyr::select(ID, hgnc_symbol, log2FoldChange, padj, DE, z_score)
kable(top_genes, caption='Top 20 genes with the highest max z-score value')
| ID | hgnc_symbol | log2FoldChange | padj | DE | z_score |
|---|---|---|---|---|---|
| ENSG00000086570 | FAT2 | 1.2019142 | 0.0020615 | TRUE | 8.819450 |
| ENSG00000124493 | GRM4 | 0.1435437 | 0.2984922 | FALSE | 8.806634 |
| ENSG00000181541 | MAB21L2 | 0.5391928 | 0.3724211 | FALSE | 8.803135 |
| ENSG00000187951 | ARHGAP11B | NA | NA | NA | 8.801469 |
| ENSG00000180347 | CCDC129 | 0.1538918 | 0.7154404 | FALSE | 8.787919 |
| ENSG00000164220 | F2RL2 | 0.6499642 | 0.0886975 | FALSE | 8.752194 |
| ENSG00000196092 | PAX5 | 0.6662045 | 0.0779792 | FALSE | 8.746626 |
| ENSG00000123572 | NRK | 0.3115349 | 0.4525888 | FALSE | 8.729920 |
| ENSG00000165646 | SLC18A2 | -0.4914993 | 0.2622890 | FALSE | 8.691294 |
| ENSG00000074370 | ATP2A3 | 0.4635054 | 0.0030267 | TRUE | 8.673706 |
| ENSG00000214694 | ARHGEF33 | 0.1401980 | 0.1035356 | FALSE | 8.663063 |
| ENSG00000187017 | ESPN | 0.2351648 | 0.6632260 | FALSE | 8.649915 |
| ENSG00000132130 | LHX1 | -0.0584408 | 0.9152006 | FALSE | 8.646916 |
| ENSG00000187595 | ZNF385C | 0.4673824 | 0.0044824 | TRUE | 8.631137 |
| ENSG00000171219 | CDC42BPG | 0.5819641 | 0.1224602 | FALSE | 8.602846 |
| ENSG00000066230 | SLC9A3 | -0.1463554 | 0.7968300 | FALSE | 8.507227 |
| ENSG00000108001 | EBF3 | 0.5071822 | 0.2508842 | FALSE | 8.493193 |
| ENSG00000129277 | CCL4 | 0.0842550 | 0.8553859 | FALSE | 8.470016 |
| ENSG00000133063 | CHIT1 | -0.2575575 | 0.4639968 | FALSE | 8.463125 |
| ENSG00000221818 | EBF2 | 0.1693046 | 0.7626034 | FALSE | 8.459121 |
| ENSG00000129167 | TPH1 | 0.0902837 | 0.8168627 | FALSE | 8.449049 |
plot_function(1)
plot_function(2)
plot_function(3)
plot_function(4)
plot_function(5)
plot_function(6)
plot_function(7)
Most outliers seem to come from the same samples
Most of the samples belong to the ASD group
The top two samples come from the same subject, same happens with samples 5 and 6
Taking all the genes that have at least one sample with a z-score larger than 6
samples_info = table(max_z_score$outlier_sample[max_z_score$z_score>6]) %>% data.frame %>% filter(Freq>0) %>%
rename(Var1 = 'Sample_ID', Freq = 'Count') %>% left_join(datMeta, by='Sample_ID') %>%
dplyr::select(Sample_ID, Brain_lobe, Sex, Age, Batch, PMI, Diagnosis, Count) %>% arrange(desc(Count))
cat(paste0(sum(max_z_score$z_score>6), ' genes have a max z-score larger than 6'))
## 591 genes have a max z-score larger than 6
kable(samples_info, caption = 'Samples with the most outliers considering genes with a z-score larger than 6')
| Sample_ID | Brain_lobe | Sex | Age | Batch | PMI | Diagnosis | Count |
|---|---|---|---|---|---|---|---|
| AN02987_BA38 | Temporal | M | 15 | 10.10.2014 | 30.80 | ASD | 158 |
| AN02987_BA7 | Parietal | M | 15 | 10.10.2014 | 30.80 | ASD | 101 |
| AN11796_BA38 | Temporal | M | 14 | 10.10.2014 | 2.35 | ASD | 81 |
| AN17515_BA38 | Temporal | M | 54 | 10.10.2014 | 28.25 | ASD | 62 |
| AN00493_BA17 | Occipital | M | 27 | 10.10.2014 | 8.30 | ASD | 40 |
| AN00493_BA4_6 | Frontal | M | 27 | 6.20.2014 | 8.30 | ASD | 35 |
| AN17254_BA38 | Temporal | M | 51 | 10.10.2014 | 22.16 | ASD | 27 |
| AN00764_BA4_6 | Frontal | M | 20 | 10.10.2014 | 23.70 | ASD | 20 |
| AN04166_BA38 | Temporal | M | 24 | 6.20.2014 | 18.51 | ASD | 13 |
| AN01971_BA17 | Occipital | M | 39 | 10.10.2014 | 31.50 | ASD | 8 |
| AN17254_BA7 | Parietal | M | 51 | 10.10.2014 | 22.16 | ASD | 8 |
| AN08792_BA7 | Parietal | M | 30 | 10.10.2014 | 20.30 | ASD | 7 |
| AN12457_BA17 | Occipital | F | 29 | 6.20.2014 | 17.83 | ASD | 7 |
| AN08166_BA4_6 | Frontal | M | 28 | 10.10.2014 | 43.25 | ASD | 4 |
| AN07176_BA4_6 | Frontal | M | 21 | 10.10.2014 | 29.91 | CTL | 3 |
| AN08166_BA7 | Parietal | M | 28 | 10.10.2014 | 43.25 | ASD | 2 |
| AN09730_BA17 | Occipital | M | 22 | 10.10.2014 | 25.00 | ASD | 2 |
| AN13872_BA4_6 | Frontal | F | 5 | 6.20.2014 | 33.00 | ASD | 2 |
| AN00764_BA17 | Occipital | M | 20 | 10.10.2014 | 23.70 | ASD | 1 |
| AN04166_BA7 | Parietal | M | 24 | 10.10.2014 | 18.51 | ASD | 1 |
| AN07444_BA4_6 | Frontal | M | 17 | 10.10.2014 | 30.75 | CTL | 1 |
| AN08043_BA17 | Occipital | F | 52 | 6.20.2014 | 39.15 | ASD | 1 |
| AN08792_BA38 | Temporal | M | 30 | 6.20.2014 | 20.30 | ASD | 1 |
| AN08873_BA17 | Occipital | M | 5 | 10.10.2014 | 25.50 | ASD | 1 |
| AN08873_BA7 | Parietal | M | 5 | 10.10.2014 | 25.50 | ASD | 1 |
| AN10723_BA38 | Temporal | M | 60 | 10.10.2014 | 24.23 | CTL | 1 |
| AN17425_BA17 | Occipital | M | 16 | 10.10.2014 | 26.16 | CTL | 1 |
| AN19511_BA38 | Temporal | M | 8 | 6.20.2014 | 22.20 | ASD | 1 |
| AN19760_BA4_6 | Frontal | M | 28 | 6.20.2014 | 23.25 | CTL | 1 |
There is not a big difference in the average z-score of the genes in the outlier samples, the most notable difference is in its outliers
plot_data = data.frame('ID' = rownames(datExpr))
outlier_samples = samples_info$Sample_ID[samples_info$Count>5]
for(s in outlier_samples){
outlier_idx = which(datMeta$Sample_ID==s)
z_scores = apply(datExpr, 1, function(x) (abs(x[outlier_idx]-mean(x)))/sd(x))
plot_data = cbind(plot_data, z_scores)
}
colnames(plot_data)[-1] = as.character(outlier_samples)
set.seed(123)
average_sample = sample(datMeta$Sample_ID[! datMeta$Sample_ID %in% outlier_samples],1) %>% as.character
cat(paste0('Using random sample ', average_sample, ' as a reference'))
## Using random sample AN09730_BA4_6 as a reference
random_sample_idx = which(datMeta$Sample_ID==average_sample)
plot_data_melt = plot_data %>% mutate('Random Sample'=apply(datExpr, 1, function(x) abs(x[random_sample_idx]-mean(x))/sd(x))) %>%
melt() %>% left_join(datMeta %>% dplyr::select(Sample_ID, Diagnosis), by = c('variable'='Sample_ID')) %>%
mutate(variable=factor(variable, levels=c(unique(as.character(samples_info$Sample_ID)),'Random Sample'), ordered=T)) %>%
mutate(Diagnosis = ifelse(variable=='Random Sample', as.character(datMeta$Diagnosis[datMeta$Sample_ID==average_sample]), as.character(Diagnosis)))
ggplotly(plot_data_melt %>% ggplot(aes(variable, value, fill=Diagnosis)) + geom_boxplot() + xlab('Samples') +
ylab('|z-scores|') + theme_minimal() + theme(axis.text.x = element_text(angle = 90, hjust = 1)))
rm(plot_data, plot_data_melt, s, outlier_idx, z_scores, average_sample, random_sample_idx)
Even though most of the outliers come from the same group of samples, they weren’t filtered out when filtering outlier samples
datExpr_filtered = datExpr
datMeta_filtered = datMeta
This was the plot used to filter outlier samples. Some of the outlier samples we have found are close to the threshold of -2, but some aren’t, os they wouldn’t have been filtered out even if the threshold would have been higher
# Load original expression datasets
datExpr = read.csv('./../Data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../Data/RNAseq_ASD_datMeta.csv')
datMeta = datMeta %>% mutate(Brain_Region = as.factor(Region)) %>%
mutate(Brain_lobe = case_when(Brain_Region == 'BA4_6' ~ 'Frontal',
Brain_Region == 'BA7' ~ 'Parietal',
Brain_Region == 'BA38' ~ 'Temporal',
TRUE ~ 'Occipital')) %>%
mutate(Batch = as.factor(gsub('/', '.', RNAExtractionBatch)),
Diagnosis = factor(Diagnosis_, levels=c('CTL','ASD'))) %>%
dplyr::select(-Diagnosis_)
# Filtering genes: These filters would be the same, so I'll just keep the genes present in the
# filtered dataset instead of repeating the whole filtering
datExpr = datExpr[rownames(datExpr) %in% rownames(datExpr_filtered),]
# Filtering samples
absadj = datExpr %>% bicor %>% abs
## alpha: 1.000000
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))
# Plot results
plot_data = data.frame('sample'=1:length(z.ku), 'distance'=z.ku, 'Sample_ID'=datMeta$Sample_ID,
'Subject_ID'=datMeta$Subject_ID, 'Extraction_Batch'=datMeta$RNAExtractionBatch,
'Brain_Lobe'=datMeta$Brain_lobe, 'Sex'=datMeta$Sex, 'Age'=datMeta$Age,
'Diagnosis'=datMeta$Diagnosis, 'PMI'=datMeta$PMI) %>%
mutate(outlier = Sample_ID %in% samples_info$Sample_ID[samples_info$Count>10]) %>%
left_join(samples_info %>% dplyr::select(Sample_ID, Count), by='Sample_ID') %>%
dplyr::rename('Number of Outliers' = Count) %>%
mutate(`Number of Outliers` = ifelse(is.na(`Number of Outliers`), 0, `Number of Outliers`))
ggplotly(plot_data %>% ggplot(aes(sample, distance, color=Diagnosis)) +
geom_point(alpha=plot_data$outlier/2+.2, aes(id=Subject_ID)) +
geom_hline(yintercept = -2, color = 'gray') + theme_minimal() +
ggtitle('Original position of the outlier genes wrt all the other samples'))
selectable_scatter_plot(plot_data, plot_data[,-c(1,2)])